# encoding: UTF-8
# author: admin, 2017/3/31
# Crawl jokes (duanzi) from qiushibaike.com and store them in MongoDB.

import requests
import re
from bs4 import BeautifulSoup
import pymongo
from pymongo import MongoClient

def CrawlerJoke(url,db):
    """Scrape one page of qiushibaike text jokes and store new ones in MongoDB.

    Args:
        url: page URL, e.g. "http://www.qiushibaike.com/text/page/1".
        db: pymongo Database; documents are written to the ``joke`` collection.

    Each stored document has keys: author, content, jokc_url, like.
    """
    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"}
    # Timeout so a hung server cannot block the whole crawl indefinitely.
    start_html = requests.get(url, headers=headers, timeout=10)

    soup = BeautifulSoup(start_html.text, 'lxml')

    # Each joke sits in its own <div class="article block untagged mb15">.
    joke_list = soup.find_all('div', class_="article block untagged mb15")

    for child_joke in joke_list:
        author = child_joke.find("h2").string
        joke_content = child_joke.find('div', class_="content").find_all('span')[0].get_text()
        joke_url = "http://www.qiushibaike.com" + child_joke.find('a', class_="contentHerf")["href"]
        like = child_joke.find('i', class_="number").get_text()

        # Skip jokes already stored. NOTE: "jokc_url" is a misspelling of
        # "joke_url" but is kept as-is for compatibility with existing data.
        if not col_exist(db, {"jokc_url": joke_url}):
            joke_dict = {"author": author, "content": joke_content, "jokc_url": joke_url, "like": like}
            # Collection.insert was deprecated and removed in PyMongo 4;
            # insert_one is the supported replacement.
            db.joke.insert_one(joke_dict)

def col_exist(db,query):        # check whether a matching document already exists (dedup by URL)
    """Return True if a document matching *query* exists in db.joke.

    Args:
        db: pymongo Database holding the ``joke`` collection.
        query: MongoDB filter dict, e.g. {"jokc_url": url}.
    """
    # Cursor.count() was deprecated and removed in PyMongo 4;
    # count_documents is the supported replacement.
    return db.joke.count_documents(query) > 0

if __name__=="__main__":

    dbConf="mongodb://localhost:27017"      #MongoDB的数据库连接
    client=MongoClient(dbConf)
    db=client.test
    for i in range(1,3):
        CrawlerJoke("http://www.qiushibaike.com/text/page/"+str(i),db)